knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(plotly)
library(rstatix)
library(corrplot)
library(ggpubr)
library(GGally)
library(factoextra)
library(pheatmap)
library(FactoMineR)
library(ggbiplot)
library(tidymodels)
library(embed)
library(stats)
library(ggplot2)

#Задание 1

data <- readRDS("life_expectancy_data.RDS")

#Задание 2

# Создание интерактивного графика с использованием 'continent' для раскраски
plot <- plot_ly(data, x = ~`Life expectancy`, y = ~Unemployment, color = ~continent, 
                type = "scatter", mode = "markers", text = ~Gender, marker = list(size = 10)) %>%
  layout(title = "Life Expectancy vs Unemployment",
         xaxis = list(title = "Life Expectancy"),
         yaxis = list(title = "Unemployment"),
         showlegend = TRUE)

# Отображение графика
plot

#Задание 3

filter_data <- subset(data, data$continent %in% c("Africa", "Americas"))

# Тест Манна-Уитни
stat.test <- filter_data %>%
  wilcox_test(`Life expectancy` ~ continent)
stat.test
## # A tibble: 1 × 7
##   .y.             group1 group2      n1    n2 statistic        p
## * <chr>           <chr>  <chr>    <int> <int>     <dbl>    <dbl>
## 1 Life expectancy Africa Americas    52    38       107 6.34e-13
# Создание ящиков с усами
p <- ggboxplot(
  filter_data,
  x = "continent", y = "Life expectancy"
) +
  labs(subtitle = get_test_label(stat.test, detailed = TRUE))

# Визуализация результатов теста Манна-Уитни
p + geom_signif(comparisons = list(c("Africa", "Americas")), 
                map_signif_level = TRUE, 
                textsize = 6, vjust = 0.5)

#Задание 4

# Выбор числовых переменных и исключение Year
data2 <- data %>% 
  select_if(is.numeric) %>%
  select(-Year)

# Корреляционный анализ и построение корреляционного графика с corrplot
cor_plot <- data2 %>%
  select(everything()) %>%
  psych::corr.test(adjust = "BH")

corrplot(corr = cor_plot$r,
         p.mat = cor_plot$p,
         method = "color",
         order = "hclust")

# Визуализация корреляций с помощью ggpairs
cor_plot2 <- ggpairs(data2,
                     title = 'Correlations in dataset',
                     progress = FALSE) +
  theme_minimal() +
  scale_fill_manual(values = c('#69b3a2')) +
  scale_colour_manual(values = c('#69b3a2'))

cor_plot2

#Задание 5

# Масштабирование данных
data2_scaled <- scale(data2)

# Расчет матрицы расстояний
data2_dist <- dist(data2_scaled, method = "euclidean")

# Иерархическая кластеризация
data2_hc <- hclust(d = data2_dist, method = "ward.D2")

# Визуализация дендрограммы
fviz_dend(data2_hc, 
          k = 5,  # количество кластеров
          k_colors = c("#cc0337", "#010d85", "#37953f", "#f98866","#5BC7F2" ),
          cex = 0.1,
          rect = TRUE) +
  guides(color = "none")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Задание6

pheatmap(data2_scaled, 
         clustering_method = "ward.D2", 
         cutree_rows = 5,
         cutree_cols = length(colnames(data2_scaled)),
         angle_col = 90, 
         main = "Dendrograms for clustering rows and columns with heatmap")

#Задание 7

data_pca <- prcomp(data2_scaled) 
summary(data_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     2.7526 1.4841 1.3952 1.17177 1.08375 0.96347 0.9288
## Proportion of Variance 0.3988 0.1159 0.1025 0.07227 0.06182 0.04886 0.0454
## Cumulative Proportion  0.3988 0.5147 0.6172 0.68945 0.75126 0.80012 0.8455
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.85740 0.69263 0.68937 0.59106 0.54986 0.47085 0.36596
## Proportion of Variance 0.03869 0.02525 0.02501 0.01839 0.01591 0.01167 0.00705
## Cumulative Proportion  0.88421 0.90946 0.93447 0.95286 0.96877 0.98044 0.98749
##                           PC15    PC16    PC17    PC18      PC19
## Standard deviation     0.34546 0.26941 0.20224 0.06968 1.012e-15
## Proportion of Variance 0.00628 0.00382 0.00215 0.00026 0.000e+00
## Cumulative Proportion  0.99377 0.99759 0.99974 1.00000 1.000e+00

#Задание8

plot_bi <- ggbiplot(
  data_pca,
  scale = 0,
   groups = as.factor(data$continent),  
  ellipse = TRUE,
  alpha = 0.2,
) + 
  geom_point(
    aes(
      color = data$continent,
      fill = data$Country
    )
  ) +
  theme_minimal()

plotly_1 <- ggplotly(plot_bi)

# Вывести график
plotly_1

#Задание10

umap <- recipe(~., data = data2) %>% 
  step_normalize(all_predictors()) %>% 
  step_umap(all_predictors()) %>% 
  prep() %>% 
  juice() 
umap2 <- cbind(umap, data)

umap2 %>%
  ggplot(aes(UMAP1, UMAP2)) + 
  geom_point(aes(color = continent,
             alpha = 0.7, size = 2)) +
  labs(color = NULL)

#Задание 11

# Удаление 5 случайных колонок
set.seed(123)  # для воспроизводимости
removed_columns <- sample(colnames(data2), 5)
data_after_removal <- data2[, !(colnames(data2) %in% removed_columns)]

# Функция для проведения PCA и получения кумулятивного процента объяснённой вариации
perform_pca <- function(data) {
  pca_result <- prcomp(data, scale. = TRUE)
  cumulative_variance <- cumsum(pca_result$sdev^2) / sum(pca_result$sdev^2)
  return(list(result = pca_result, cumulative_variance = cumulative_variance))
}

# Проведение PCA анализа три раза после удаления 5 случайных колонок
pca_results <- lapply(1:3, function(i) {
  set.seed(123 + i)  # изменение seed для различных итераций
  removed_columns <- sample(colnames(data2), 5)
  data_after_removal <- data2[, !(colnames(data2) %in% removed_columns)]
  perform_pca(data_after_removal)
})

# Визуализация кумулятивного процента объяснённой вариации
par(mfrow = c(1, 3))
for (i in 1:3) {
  plot(pca_results[[i]]$cumulative_variance, type = 'b', main = paste("PCA Run", i),
       xlab = "Number of Principal Components", ylab = "Cumulative Variance Explained")
}